/*******************************************************************************
																				
	DESCRIPTION: 	This do file creates the datasets that are to be fed into R 
					to create the ensemble predictions for the expanded models.
					
					This do file saves a separate dataset for every model and year.
				
*******************************************************************************/

clear all
global id_code 002

************************************************************************
* 1a. Expanded model: Preparation to split the dataset
************************************************************************

* Importing data
* (the data global is not set in this file; it is expected to be defined
* by whatever runs this do file)
use "${data}/001_9_FinalMainDataset_expanded.dta", clear

* Put the observations in a reproducible random order so that the training
* sample is a random sample. The seed is fixed, and isid stops the do file
* if two observations draw the same value, because ties would make the
* sort order ambiguous.
set seed 2111
gen double random = runiform()
isid random
sort random
drop random

* Position of each observation in the random order. Stored as long because
* the default float type cannot represent every integer above 16,777,216,
* which would give duplicated values of n in a large data set.
gen long n = _n

* Creating dummy variables for non-ordinal categorical variables.
* Each variable is replaced by indicators named varname_dummy1, varname_dummy2, ...
* Note that tab leaves observations with a missing category missing on all
* of that variable's indicators.
foreach var in L_Industry_3digit L_civilStatus EducLevel L_Municipality L_Age_Youngest citizenship migrationCohort L_Occupation_3D_L1L2 {
	tab `var', gen(`var'_dummy)
	drop `var'
}

************************************************************************
* 1b. Expanded models: full sample (imputing + missing dummy)
************************************************************************

* Variable-name patterns for the extra covariate blocks of the expanded
* models, listed in the order of the model suffixes UI, WE, OC, RR, IQ, UM.
* The wildcards are resolved when the globals are used in a varlist.

* Suffix UI
global UI "L_additionalUI*"

* Suffix WE
global wealth "L_NetWealth* L_Liabilities* L_BankAccount* L_RealEstate*"

* Suffix OC (matches the L_Occupation_3D_L1L2_dummy indicators created in 1a)
global occup "L_Occupation_3D_L1L2*"

* Suffix RR
global levelUI "replacRatio_new*"

* Suffix IQ
global IQ "cognit_dummy* non_cognit_dummy*"

* Suffix UM
global union "L_unionMember*"

* Run the loop:
*
* One data set is saved per model and year. A model name has the form
* prefix_FullSample_spec, where
*   prefix EX     uses the full baseline covariate blocks,
*   prefix EX_Ba  uses only the demoEdu block,
*   spec          selects the extra covariate block(s) added on top
*                 (Baseline adds none, ALL adds all six).
* The nested loops generate the 16 model names in the same order as before:
* all EX models first, then all EX_Ba models.
* The globals demoEdu, migHist, mun, incIndiv, incOther, incHist, indu and
* emplHist are not defined in this file and must be set before it runs.
foreach prefix in EX EX_Ba {
	foreach spec in Baseline UI WE OC RR IQ UM ALL {

		local model `prefix'_FullSample_`spec'

		************************************************************************
		* Define the year span and set of variables for each model
		************************************************************************

		local sampleName Full
		local yearSpan 2006/2006

		* Covariate blocks shared by all models with this prefix
		if "`prefix'" == "EX" {
			local base "$demoEdu $migHist $mun $incIndiv $incOther $incHist $indu $emplHist"
		}
		else {
			local base "$demoEdu"
		}

		* Extra covariate block(s) selected by the suffix. The local is reset
		* on every pass so that nothing carries over from the previous model.
		local extra ""
		if "`spec'" == "UI"  local extra "$UI"
		if "`spec'" == "WE"  local extra "$wealth"
		if "`spec'" == "OC"  local extra "$occup"
		if "`spec'" == "RR"  local extra "$levelUI"
		if "`spec'" == "IQ"  local extra "$IQ"
		if "`spec'" == "UM"  local extra "$union"
		if "`spec'" == "ALL" local extra "$union $IQ $levelUI $UI $wealth $occup"

		local varSet "`base' `extra'"

		************************************************************************
		* Keep only observations from a particular year
		************************************************************************

		forvalues y = `yearSpan' {

			* All changes below are undone by restore at the end of the pass
			preserve

			* Indicator for observations with at least one non-missing
			* 3-month outcome in year y (tempvar avoids name clashes)
			tempvar anyOutcome
			gen byte `anyOutcome' = 0

			* Outcomes measured i months into the spell are set to missing
			* unless startU plus i*30 days falls inside year y
			forvalues i = 0(3)12 {

				replace emplAft3M_`i'M_In = . if !inrange(startU + `i'*30, d(01jan`y'),	///
					d(31dec`y'))
				replace emplAft6M_`i'M_In = . if !inrange(startU + `i'*30, d(01jan`y'),	///
					d(31dec`y'))

				replace `anyOutcome' = 1 if emplAft3M_`i'M_In != .
			}

			replace emplAft12M_0M_In = . if !inrange(startU, d(01jan`y'), d(31dec`y'))

			keep if `anyOutcome' == 1

			************************************************************************
			* Keep only observations and variables that belong to the sample
			************************************************************************
			* We keep only observations in the baseline (Full) sample:
			keep if inSample_`sampleName' == 1

			* Keep only variables that belong to the model. The ordering
			* variable n from section 1a is not kept: n and n_order are
			* taken from the baseline data set below.
			keep LopNr_PersonNr InLnr `varSet' inSample_`sampleName' emplAft*

			************************************************************************
			* Prepare and save the sample
			************************************************************************
			rename inSample_`sampleName' inSample

			compress

			* The variables used to split the data into parameter tuning,
			* training and prediction partitions come from the baseline data
			* set of the same sample and year, so that the sorting is the same
			* as in the baseline models. Only matched observations are kept.
			* NOTE(review): assumes the baseline files are named
			* 002_DataForR_sample_year.dta - confirm if the year span is extended.
			local nBefore = _N

			merge 1:1 LopNr_PersonNr InLnr using "${data}/002_DataForR_`sampleName'_`y'.dta", ///
				keepusing(n n_order) nogen keep(3)

			* Make dropped observations visible instead of losing them silently
			local nLost = `nBefore' - _N
			if `nLost' > 0 {
				display as text "note: `nLost' observation(s) of `model' (`y') had no match in the baseline data and were dropped"
			}

			sort n_order

			* The last variables should be covariates starting from gender
			* This is used to tell R which variables should be included in the model

			order LopNr_PersonNr InLnr n n_order inSample emplAft* Gender, first

			save "${data}/${id_code}_DataForR_`model'_`y'.dta", replace

			restore

		}
	}
}
